Exploring the Wide World of R

Hans Lehndorff

Me and R

I’ve used R in both professional and academic settings over the last 6 years1.

There is a lot that R can do. While there are many things I enjoy doing in R, this presentation is about one of my favorite applications – data visualization.

Show code
df <- data.frame(
  x=c(0, 10), 
  y=c(1, 1), 
  r = c(8,7),
  who=c("Things R can do","Things I know how to do")
  )

#create scatter plot with circle
ggplot(data = df) +
  geom_circle(aes(x0=x,y0=y,r=r,fill=who),alpha=.5)+
  geom_circle(aes(x0=x,y0=y,r=r))+
  theme_void()+
  theme(legend.position = "bottom")+
  labs(fill=NULL)+
  scale_fill_brewer(type="qual",palette = 6)+
  coord_fixed()+
  guides(fill=guide_legend(reverse = T))

Data Visualization

Overview

  • My aim with data visualization is to make my analysis in R more accessible.

  • This is most often done via the ggplot2 package in R.

  • _

Charts

Here are a few examples of ggplot2 visualizations I’ve made.

Show code
squirrels %>% 
  ggplot()+
  geom_bin2d(aes(y=lat,x=long))+
  theme_minimal()+
  coord_map()+
  labs(
    x=NULL,y=NULL,
    fill="Count of\nSquirrels",subtitle="Heat Map of Squirrel Sightings in Central Park",title='Squirrel Town')+
  theme(
    legend.position = 'right',
    plot.caption = element_text(hjust = 0)
    )

See original analysis

Charts

Here are a few examples of ggplot2 visualizations I’ve made.

Show code
plot_data<-final_dataset %>% 
  filter(circuit_label_type_desc=="Stove/Oven/Range") %>% 
  filter(doy<350) %>% 
  group_by(date,date_2020) %>% 
  summarise(kwh=sum(kwh)) %>% 
  arrange(date) %>% 
  group_by(year(date)) %>% 
  mutate(sma=zoo::rollmean(kwh,30,na.pad = T,align = 'right'))

p2<-ggplot(plot_data)+
  geom_smooth(aes(x=date_2020,y=kwh,color=factor(year(date))),linewidth=1.5,method = 'loess',span=.17,se=F)+
  geom_hline(yintercept = 0,color=NA)+
  geom_vline(xintercept = as.Date("2020-03-08"))+
  labs(x="Day of the Year",y="Smoothed Daily Energy Usage (kWh)",color="Time Period",
       title="Comparison of Stove Energy Usage by Day of Year", subtitle = "Stove usage increased dramatically when the Pandemic started")+
  scale_color_brewer(type = "qual",palette =  6)+
  scale_x_date(date_breaks = "1 month",date_labels = "%b",limits = c(as.Date("2020-01-01"),as.Date("2020-09-01")))+
  theme_minimal()+
  theme(
    legend.position = 'bottom',
    panel.grid.minor.x = element_blank()
    )
p2

See original analysis

Charts

Here are a few examples of ggplot2 visualizations I’ve made.

Show code
agg_dat<-wwc_outcomes %>% 
  group_by(team,round) %>% 
  summarise(n=n(),mean=mean(score)) %>% 
  filter(round!="Third Place Playoff") %>% 
  filter("Final"%in%round) %>% 
  group_by(team) %>% 
  filter(n[round=="Final"]>1)

agg_dat$round<-factor(agg_dat$round,levels = c("Group","Round of 16","Quarter Final","Semi Final","Final"))

h2h<-wwc_outcomes %>% 
  filter(team%in%agg_dat$team) %>% 
  group_by(year,yearly_game_id) %>% 
  filter(n()==2)

output<-NULL
for(i in unique(h2h$team)){
  this_summary<-h2h %>% 
    group_by(year,yearly_game_id) %>% 
    filter(i%in%team) %>%
    group_by(this_team=i,team) %>% 
    summarise(
      w=sum(win_status=="Won"),
      l=sum(win_status=="Lost"),
      d=sum(win_status=="Tie")) %>% 
    mutate(
      pct=l/(w+l)
    )
  output<-bind_rows(
    this_summary,
    output
  )
  
}

final_summary<-wwc_outcomes %>% 
  filter(round=="Final") %>% 
  group_by(team) %>% 
  summarise(apps=n(),first=min(year),goals=sum(score)) %>% 
  arrange(-goals)

p3<-output %>% 
  left_join(codes) %>% 
  filter(this_team!=team) %>% 
  ggplot()+
  geom_col(aes(x=country,y=pct,fill=country))+
  theme_minimal()+
  theme(
    legend.position = 'bottom',
    strip.background = element_rect(fill="gray80"),
    strip.placement = 'left',
    strip.text.y.left = element_text(angle = 0))+
  scale_y_continuous(labels = scales::percent,expand = c(0,0,.05,0))+
  scale_fill_manual(values=rev(c("black","#00205B","#BC002D","#FFCC00")))+
  coord_flip()+
  facet_grid(paste(this_team,"V.")~.,scale="free_y",switch = "y")+
  guides(fill='none')+
  labs(x=NULL,y="Winning Percentage (excluding draws)",
       title="Better v. Best",subtitle = "The United States Women's National Soccer team wins 75% of its matches against other top teams")

p3

See original analysis

Tables

These visualization can even show up in tables.

Show code
initial_table<-final_dataset %>% 
  filter(doy<350&doy>yday("2020-03-15")) %>% 
  group_by(circuit_label_type_desc,date) %>% 
  summarise(kwh=sum(kwh)) %>% 
  group_by(circuit_label_type_desc,Year=year(date)) %>% 
  summarise(
    kWh=round(mean(kwh),3)
    ) %>% 
  pivot_wider(names_from = Year,values_from = kWh) %>% 
  mutate(`Percent Change`=round((`2020`-`2019`)/`2019`,4)) %>% 
  arrange(-abs((`2020`-`2019`)/`2019`)) %>% 
  mutate(Histogram=NA) %>% 
  ungroup()

for(eu in initial_table$circuit_label_type_desc){
  # print(eu)
  plot<-final_dataset %>% 
    filter(doy<350&doy>yday("2020-03-15")) %>% 
    filter(circuit_label_type_desc==eu) %>% 
    group_by(circuit_label_type_desc,date,year=factor(year(date))) %>% 
    summarise(kwh=sum(kwh)) %>% 
    ggplot()+
    geom_density(aes(x=kwh,color=year,fill=year),alpha=.4)+
    scale_color_brewer(type = "qual",palette =  "Set1")+
    scale_fill_brewer(type = "qual",palette =  "Set1")+
    scale_y_continuous(breaks = NULL)+
    theme_minimal()+
    theme(legend.position = 'bottom')+
    labs(x="Daily kWh",y=NULL,fill="Year",color="Year")
  
  initial_table$Histogram[initial_table$circuit_label_type_desc==eu]<-list(plot)
}


output_table <- initial_table %>%
  rename(
    "2019 Average Daily Usage" = `2019`,
    "2020 Average Daily Usage" = `2020`,
    "End Use"=circuit_label_type_desc) %>% 
gt() %>%
text_transform(
 locations = cells_body(Histogram),
 fn = function(Histogram) {
   map(initial_table$Histogram, ggplot_image, height = px(150))
 }
) %>% 
  fmt_percent(columns = `Percent Change`) %>% 
  tab_options(
    table.width = 1
  ) %>% 
  gtExtras::gt_color_rows(
    columns = `Percent Change`,
    palette = c("#E41A1C","white", "#377EB8"),
    direction = 1,
    domain = c(-1,1),
    pal_type = "continuous"
    )

# brewer.pal(2,"Set1")
output_table
End Use 2019 Average Daily Usage 2020 Average Daily Usage Percent Change Histogram
Electric Vehicle Charger 2.405 1.173 −51.23%
Stove/Oven/Range 0.178 0.225 26.40%
Ductless Heatpump 0.676 0.837 23.82%
Electric Furnace 0.696 0.820 17.82%
Ducted Heatpump 1.501 1.650 9.93%
Clothes Washer 0.051 0.046 −9.80%
Other 1.731 1.900 9.76%
Clothes Dryer 0.477 0.438 −8.18%
Electric Resistance Storage Water Heaters 1.849 1.991 7.68%
Mains 6.813 7.314 7.35%
Central AC 1.162 1.227 5.59%
Refrigerator/Freezer 0.451 0.476 5.54%
Hot Tub 2.107 1.997 −5.22%
Dishwasher 0.097 0.102 5.15%

Interactivity

Or be interactive.

Show code
sites_with_mains<-subset(points,circuit_label_type_desc=="Mains")$ee_site_id
these_eus<-c(
  "Mains","Hot Tub","Electric Resistance Storage Water Heaters",
  "Central AC", "Clothes Washer", "Clothes Dryer","Refrigerator/Freezer",
  "Other","Stove/Oven/Range","Electric Vehicle Charger","Dishwasher",
  "Ductless Heatpump","Electric Furnace","Ducted Heatpump"
  )

#Step 1 drop incomplete circuits
circuit_agg<-analysis_data %>% 
  filter(ee_site_id%in%sites_with_mains) %>% 
  group_by(ee_site_id,regname) %>% 
  summarise(n=n())

retain_circuits<-circuit_agg %>% 
  filter(n>12755)

analysis_data<-analysis_data %>% 
  inner_join(retain_circuits %>% select(-n))

#Step 3 drop errant days
day_agg<-analysis_data %>% 
  group_by(date,hour) %>% 
  summarise(n=n())

retain_hours<-day_agg %>% 
  filter(n>625) %>% 
  filter(!date%in%as.Date(c("2019-09-25","2019-09-26")))

analysis_data<-analysis_data %>% 
  inner_join(retain_hours %>% select(-n))

weather_stations <- read_csv("data/NEEA HEMS/weather_stations_geocodio.csv",name_repair = 'universal') %>% 
  mutate(station_id=paste(USAFID,WBAN,sep="-")) %>% 
  mutate(State=case_when(
    State=="OR"~"Oregon",
    State=="WA"~"Washington",
    State=="ID"~"Idaho",
    State=="MT"~"Montana"
  ),
  County=gsub(" County","",County,fixed = T))

state_maps<-map_data("state",c("oregon","washington","idaho","montana"))
county_maps<-map_data("county",c("oregon","washington","idaho","montana"))

sites_ws<-sites %>% 
  filter(ee_site_id%in%analysis_data$ee_site_id) %>% 
  left_join(
    weather_stations %>% 
      group_by(station_id) %>% 
      filter(1:n()==1)
  ) 

data_for_map<-analysis_data %>% 
  left_join(sites_ws %>% select(ee_site_id,County,State,LAT,LONG)) %>% 
  group_by(County,State) %>% 
  summarise(sites=n_distinct(ee_site_id), data=n(),LAT=first(LAT),LONG=first(LONG))

county_maps<-county_maps %>% 
  mutate(across(region:subregion,stringr::str_to_title)) %>% 
  left_join(data_for_map,by=c("region"="State","subregion"="County"))

city_labs<-maps::us.cities %>% 
  filter(country.etc%in%c("OR","ID","WA","MT")) %>% 
  filter(name%in%c("Seattle WA","Bend OR","Portland OR","Boise ID","Spokane WA","Missoula MT")) %>% 
  mutate(lat=ifelse(name%in%c("Bend OR"),lat-.5,lat+.5),
         long=ifelse(name%in%c("Seattle WA"),long-2,long))

p1<-ggplot()+
  geom_polygon(data=state_maps,aes(x=long,y=lat,group=group),color='black',fill=NA)+
  geom_polygon_interactive(data=county_maps,aes(x=long,y=lat,group=group,fill=data,tooltip = paste(subregion,region), data_id = paste(subregion,region)),color=NA,size=.1)+
  # geom_point(data=data_for_map,aes(x=LONG,y=LAT,size=data,fill=sites),color="black",shape=21,alpha=.5)+
  geom_label(data=city_labs,aes(x=long,y=lat,label=name))+
  coord_map(xlim = c(-127,-110))+
  theme_void()+
  scale_size_continuous(labels = scales::comma)+
  scale_fill_viridis_c(na.value = "white",labels=scales::comma)+
  theme(
    legend.key.width=unit(.4,"in"),
    legend.position = 'bottom',
    legend.box= 'horizontal')+
  labs(fill="Number of Observations",size="Total Observations",
       title="Geographic Distrobution of NEEA HEMS Data")+
   guides(fill = guide_colourbar(title.position="top", title.hjust = 0.5),
         size = guide_legend(title.position="top", title.hjust = 0.5))

eu_summary<-analysis_data %>% 
  left_join(points %>% select(ee_site_id,regname,circuit_label_type_desc)) %>% 
  left_join(sites_ws %>% select(ee_site_id,region=State,subregion=County)) %>% 
  group_by(region,subregion,circuit_label_type_desc) %>% 
  summarise(
    n=n()
  ) %>% 
  filter(!circuit_label_type_desc%in%c("Other","Mains","Gas Furnace (Component)")) %>% 
  mutate(
    category=case_when(
      circuit_label_type_desc%in%c("Central AC","Ducted Heatpump","Ductless Heatpump","Electric Baseboard Heaters","Other Zonal Heat","Electric Furnace") ~ "HVAC",
      grepl("water heater",circuit_label_type_desc,ignore.case = T) ~ "Water\nHeating",
      circuit_label_type_desc%in%c("Stove/Oven/Range","Microwave","Garbage Disposal","Dishwasher","Refrigerator/Freezer") ~ "Kitchen",
      TRUE~"Other"
    )
  )

p2<-ggplot(eu_summary)+
  geom_col_interactive(aes(x=n,y=forcats::fct_reorder(abbreviate(circuit_label_type_desc,30),n),fill=category,tooltip = paste(subregion,region), data_id = paste(subregion,region)))+
  scale_x_continuous(labels = scales::comma,expand=c(0,0,.05,0))+
  scale_fill_brewer(type="qual",palette = 2)+
  theme_classic()+
  facet_grid(category~.,scale="free_y",space="free")+
  labs(x="Number of Observations",y="Equipment Type",fill="Equipment Category",
       title="Appliance Distrobution of NEEA HEMS Data",
       subtitle = "Data available for range of household appliances")+
  theme(
    legend.position = 'bottom',
    strip.text.y = element_text(angle = 0),
    title = element_text(size=9))+
  guides(fill='none')

a <- girafe(code = print(p1 + p2 + plot_layout(widths = c(.65,.35))))

a

Animation

Or even animated.

Show code
# tree_plot<-ggplot(tree_dat)+
  # geom_bubble(aes(x0=x,y0=y,r=t,fill=factor(tree)),alpha=1)+
  # scale_fill_manual(values = tree_dat$color)+
  # coord_fixed(xlim = x_range,ylim = y_range)+
  # guides(fill='none')+
  # theme_void()+
  # transition_states(year)+
  # ease_aes("linear")

# animate(tree_plot,nframes = years*2,duration = years/10,end_pause = 10,renderer = gifski_renderer())  

See original analysis

Show code
library(gganimate)
library(tidyverse)

load("data/swift_data.rdata")

birds_plot<-ggplot(all_birds2 %>% arrange(iteration) %>% filter(bird>0))+
  # geom_point(aes(x=x,y=y,color=factor(bird),group=factor(bird),alpha=iteration))+
  geom_spoke(aes(x=x,y=y,color=(bird),group=factor(bird),angle=a2),radius=1,arrow=arrow(type = "closed",length = unit(3/range,"inches")))+
  guides(color='none')+
  scale_color_viridis_b()+
  coord_equal()+
  transition_states(iteration)+
  ease_aes("linear")+
  theme_void()+
  # shadow_wake(wake_length = 1)+
  theme(axis.ticks = element_blank(),
        axis.title = element_blank(),
        axis.text = element_blank())+
  ggtitle("Moment {closest_state}")

animate(birds_plot,nframes = iterations*2,duration = iterations/10,end_pause = 10,renderer = gifski_renderer())  

See original analysis

Even my website logo is generated using ggplot2.

Show code
start<-runif(3)
amount<-10000
margins=.4

grid<-data.frame(xin=runif(amount,-1,1),yin=runif(amount,-1,1),size=runif(amount,-1,1),group=round(runif(amount,1,100))) %>% 
  group_by(group) %>% 
  mutate(
    order=1:n(),
    r=rep(start[1]+runif(1,start[1]*-margins,start[1]*margins),n()),
    g=rep(start[2]+runif(1,start[2]*-margins,start[2]*margins),n()),
    b=rep(start[3]+runif(1,start[3]*-margins,start[3]*margins),n())
    ) %>% 
  ungroup() %>% 
  mutate(across(r:b,function(x){
    x[x<0]<-0
    x[x>1]<-1
    x
  })) %>% 
  mutate(color=rgb(r,g,b)) %>% 
  arrange(order)

#### V1 ####
# plot<-ggplot(grid)+
#   geom_rect(xmin=-1.2,xmax=1.2,ymax=1.2,ymin=-1.2,fill="white")+
#   geom_path(aes(x=xin,y=yin,group=group,color=color))+
#   theme_void()+
#   guides(color='none')+
#   scale_color_manual(values = unique(grid$color))+
#   geom_rect(xmin=-.25,xmax=.25,ymax=1,ymin=.25,fill="white")+
#   geom_rect(xmin=-.25,xmax=.25,ymax=-.25,ymin=-1,fill="white")

#### V2 ####
plot<-grid %>% 
  filter(!(between(yin,.25,1)&between(xin,-.25,.25))) %>% 
  filter(!(between(yin,-1,-.25)&between(xin,-.25,.25))) %>% 
  ggplot()+
  geom_point(shape="H",aes(x=xin,y=yin,color=color,size=size))+
  scale_size_continuous(range = c(4,12))+
  theme_void()+
  guides(color='none',size='none')+
  scale_color_manual(values = unique(grid$color))

Thanks!

Please get in touch with me via my website!